{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方包\n",
    "import pandas as pd\n",
    "# 读入数据\n",
    "skin = pd.read_excel(r'C:\\Users\\Administrator\\Desktop\\Skin_Segment.xlsx')\n",
    "# 设置正例和负例\n",
    "skin.y = skin.y.map({2:0,1:1})\n",
    "skin.y.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "from sklearn import model_selection\n",
    "# 样本拆分\n",
    "X_train,X_test,y_train,y_test = model_selection.train_test_split(skin.iloc[:,:3], skin.y, \n",
    "                                                                 test_size = 0.25, random_state=1234)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 调用高斯朴素贝叶斯分类器的“类”\n",
    "gnb = naive_bayes.GaussianNB()\n",
    "# 模型拟合\n",
    "gnb.fit(X_train, y_train)\n",
    "# 模型在测试数据集上的预测\n",
    "gnb_pred = gnb.predict(X_test)\n",
    "# 各类别的预测数量\n",
    "pd.Series(gnb_pred).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方包\n",
    "from sklearn import metrics\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "# 构建混淆矩阵\n",
    "cm = pd.crosstab(gnb_pred,y_test)\n",
    "# 绘制混淆矩阵图\n",
    "sns.heatmap(cm, annot = True, cmap = 'GnBu', fmt = 'd')\n",
    "# 去除x轴和y轴标签\n",
    "plt.xlabel('Real')\n",
    "plt.ylabel('Predict')\n",
    "# 显示图形\n",
    "plt.show()\n",
    "\n",
    "print('模型的准确率为：\\n',metrics.accuracy_score(y_test, gnb_pred))\n",
    "print('模型的评估报告：\\n',metrics.classification_report(y_test, gnb_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 计算正例的预测概率，用于生成ROC曲线的数据\n",
    "y_score = gnb.predict_proba(X_test)[:,1]\n",
    "fpr,tpr,threshold = metrics.roc_curve(y_test, y_score)\n",
    "# 计算AUC的值\n",
    "roc_auc = metrics.auc(fpr,tpr)\n",
    "\n",
    "# 绘制面积图\n",
    "plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')\n",
    "# 添加边际线\n",
    "plt.plot(fpr, tpr, color='black', lw = 1)\n",
    "# 添加对角线\n",
    "plt.plot([0,1],[0,1], color = 'red', linestyle = '--')\n",
    "# 添加文本信息\n",
    "plt.text(0.5,0.3,'ROC curve (area = %0.2f)' % roc_auc)\n",
    "# 添加x轴与y轴标签\n",
    "plt.xlabel('1-Specificity')\n",
    "plt.ylabel('Sensitivity')\n",
    "# 显示图形\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方包\n",
    "import pandas as pd\n",
    "# 读取数据\n",
    "mushrooms = pd.read_csv(r'C:\\Users\\Administrator\\Desktop\\mushrooms.csv')\n",
    "# 数据的前5行\n",
    "mushrooms.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 将字符型数据作因子化处理，将其转换为整数型数据\n",
    "columns = mushrooms.columns[1:]\n",
    "for column in columns:\n",
    "    mushrooms[column] = pd.factorize(mushrooms[column])[0]\n",
    "mushrooms.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn import model_selection\n",
    "# 将数据集拆分为训练集合测试集\n",
    "Predictors = mushrooms.columns[1:]\n",
    "X_train,X_test,y_train,y_test = model_selection.train_test_split(mushrooms[Predictors], mushrooms['type'], \n",
    "                                                                 test_size = 0.25, random_state = 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn import naive_bayes\n",
    "from sklearn import metrics\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "# 构建多项式贝叶斯分类器的“类”\n",
    "mnb = naive_bayes.MultinomialNB()\n",
    "# 基于训练数据集的拟合\n",
    "mnb.fit(X_train, y_train)\n",
    "# 基于测试数据集的预测\n",
    "mnb_pred = mnb.predict(X_test)\n",
    "# 构建混淆矩阵\n",
    "cm = pd.crosstab(mnb_pred,y_test)\n",
    "# 绘制混淆矩阵图\n",
    "sns.heatmap(cm, annot = True, cmap = 'GnBu', fmt = 'd')\n",
    "# 去除x轴和y轴标签\n",
    "plt.xlabel('Real')\n",
    "plt.ylabel('Predict')\n",
    "# 显示图形\n",
    "plt.show()\n",
    "\n",
    "# 模型的预测准确率\n",
    "print('模型的准确率为：\\n',metrics.accuracy_score(y_test, mnb_pred))\n",
    "print('模型的评估报告：\\n',metrics.classification_report(y_test, mnb_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "# 计算正例的预测概率，用于生成ROC曲线的数据\n",
    "y_score = mnb.predict_proba(X_test)[:,1]\n",
    "fpr,tpr,threshold = metrics.roc_curve(y_test.map({'edible':0,'poisonous':1}), y_score)\n",
    "# 计算AUC的值\n",
    "roc_auc = metrics.auc(fpr,tpr)\n",
    "\n",
    "# 绘制面积图\n",
    "plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')\n",
    "# 添加边际线\n",
    "plt.plot(fpr, tpr, color='black', lw = 1)\n",
    "# 添加对角线\n",
    "plt.plot([0,1],[0,1], color = 'red', linestyle = '--')\n",
    "# 添加文本信息\n",
    "plt.text(0.5,0.3,'ROC curve (area = %0.2f)' % roc_auc)\n",
    "# 添加x轴与y轴标签\n",
    "plt.xlabel('1-Specificity')\n",
    "plt.ylabel('Sensitivity')\n",
    "# 显示图形\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# 读入评论数据\n",
    "evaluation = pd.read_excel(r'C:\\Users\\Administrator\\Desktop\\Contents.xlsx',sheetname=0)\n",
    "# 查看数据前10行\n",
    "evaluation.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 运用正则表达式，将评论中的数字和英文去除\n",
    "evaluation.Content = evaluation.Content.str.replace('[0-9a-zA-Z]','')\n",
    "evaluation.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方包\n",
    "import jieba\n",
    "\n",
    "# 加载自定义词库\n",
    "jieba.load_userdict(r'C:\\Users\\Administrator\\Desktop\\all_words.txt')\n",
    "\n",
    "# 读入停止词\n",
    "with open(r'C:\\Users\\Administrator\\Desktop\\mystopwords.txt', encoding='UTF-8') as words:\n",
    "    stop_words = [i.strip() for i in words.readlines()]\n",
    "\n",
    "# 构造切词的自定义函数，并在切词过程中删除停止词\n",
    "def cut_word(sentence):\n",
    "    words = [i for i in jieba.lcut(sentence) if i not in stop_words]\n",
    "    # 切完的词用空格隔开\n",
    "    result = ' '.join(words)\n",
    "    return(result)\n",
    "# 对评论内容进行批量切词\n",
    "words = evaluation.Content.apply(cut_word)\n",
    "# 前5行内容的切词效果\n",
    "words[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方包\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "# 计算每个词在各评论内容中的次数，并将稀疏度为99%以上的词删除\n",
    "counts = CountVectorizer(min_df = 0.01)\n",
    "# 文档词条矩阵\n",
    "dtm_counts = counts.fit_transform(words).toarray()\n",
    "# 矩阵的列名称\n",
    "columns = counts.get_feature_names()\n",
    "# 将矩阵转换为数据框--即X变量\n",
    "X = pd.DataFrame(dtm_counts, columns=columns)\n",
    "# 情感标签变量\n",
    "y = evaluation.Type\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn import model_selection\n",
    "from sklearn import naive_bayes\n",
    "from sklearn import metrics\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "# 将数据集拆分为训练集和测试集\n",
    "X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y,test_size = 0.25, random_state=1)\n",
    "# 构建伯努利贝叶斯分类器\n",
    "bnb = naive_bayes.BernoulliNB()\n",
    "# 模型在训练数据集上的拟合\n",
    "bnb.fit(X_train,y_train)\n",
    "# 模型在测试数据集上的预测\n",
    "bnb_pred = bnb.predict(X_test)\n",
    "# 构建混淆矩阵\n",
    "cm = pd.crosstab(bnb_pred,y_test)\n",
    "# 绘制混淆矩阵图\n",
    "sns.heatmap(cm, annot = True, cmap = 'GnBu', fmt = 'd')\n",
    "# 去除x轴和y轴标签\n",
    "plt.xlabel('Real')\n",
    "plt.ylabel('Predict')\n",
    "# 显示图形\n",
    "plt.show()\n",
    "\n",
    "# 模型的预测准确率\n",
    "print('模型的准确率为：\\n',metrics.accuracy_score(y_test, bnb_pred))\n",
    "print('模型的评估报告：\\n',metrics.classification_report(y_test, bnb_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 计算正例Positive所对应的概率，用于生成ROC曲线的数据\n",
    "y_score = bnb.predict_proba(X_test)[:,1]\n",
    "fpr,tpr,threshold = metrics.roc_curve(y_test.map({'Negative':0,'Positive':1}), y_score)\n",
    "# 计算AUC的值\n",
    "roc_auc = metrics.auc(fpr,tpr)\n",
    "\n",
    "# 绘制面积图\n",
    "plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')\n",
    "# 添加边际线\n",
    "plt.plot(fpr, tpr, color='black', lw = 1)\n",
    "# 添加对角线\n",
    "plt.plot([0,1],[0,1], color = 'red', linestyle = '--')\n",
    "# 添加文本信息\n",
    "plt.text(0.5,0.3,'ROC curve (area = %0.2f)' % roc_auc)\n",
    "# 添加x轴与y轴标签\n",
    "plt.xlabel('1-Specificity')\n",
    "plt.ylabel('Sensitivity')\n",
    "# 显示图形\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
